Coal production in mines 2013

by: Simon


In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error

sns.set()

Cleaned Data

We cleaned the data in the notebook stored in: deliver/Data_Cleaning.ipynb


In [2]:
df = pd.read_csv("../data/cleaned_coalpublic2013.xls", index_col='MSHA ID')
df[['Year', 'Mine_Name']].head()


Out[2]:
Year Mine_Name
MSHA ID
103381 2013 Tacoa Highwall Miner
103404 2013 Reid School Mine
100759 2013 North River #1 Underground Min
103246 2013 Bear Creek
103451 2013 Knight Mine

In [3]:
df.log_production.hist()


Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6700b76110>

Predict the production of coal mines


In [4]:
features = ['Average_Employees',
            'Labor_Hours'
           ]
categoricals = ['Mine_State',
            'Mine_County',
            'Mine_Status',
            'Mine_Type',
            'Company_Type',
            'Operation_Type',
            'Union_Code',
            'Coal_Supply_Region',
           ]
target = 'log_production'

In [5]:
fig = plt.subplots(figsize=(14,8))
sns.set_context('poster')
sns.violinplot(y='Company_Type', x='log_production', data=df,
                  split=True, inner='stick', )
plt.tight_layout()
plt.savefig("../figures/Coal_prediction_company_type_vs_log_production.png")



In [6]:
dummy_categoricals =[]
for categorical in categoricals:
    # print categorical, len(df[categorical].unique())
    # Avoid the dummy variable trap!
    drop_var = sorted(df[categorical].unique())[-1]
    temp_df = pd.get_dummies(df[categorical], prefix=categorical)
    df = pd.concat([df, temp_df], axis=1)
    temp_df.drop('_'.join([categorical, str(drop_var)]), axis=1, inplace=True)
    dummy_categoricals += temp_df.columns.tolist()

Random Forest Regressor


In [7]:
train, test = train_test_split(df, test_size=0.3)

In [8]:
rf = RandomForestRegressor(n_estimators=100, oob_score=True)
rf.fit(train[features + dummy_categoricals], train[target])


Out[8]:
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [9]:
fit = plt.subplots(figsize=(8,8))
sns.regplot(test[target], rf.predict(test[features + dummy_categoricals]), color='green')
plt.xlim(0,22)
plt.xlabel("Actual Production")
plt.ylabel("Predicted Production")
plt.ylim(0,22)
plt.tight_layout()
plt.savefig("../figures/Coal-production-RF-prediction.png")



In [10]:
predicted = rf.predict(test[features + dummy_categoricals])
print "R^2 score:", r2_score(test[target], predicted)
print "Explained Variance score:", explained_variance_score(test[target], predicted)
print "MSE:", mean_squared_error(test[target], predicted)


R^2 score: 0.864185009021
Explained Variance score: 0.865693720201
MSE: 0.693968984262

In [11]:
rf_importances = pd.DataFrame({'name':train[features + dummy_categoricals].columns,
                              'importance':rf.feature_importances_}).sort_values(by='importance', 
                                                                                 ascending=False).reset_index(drop=True)
rf_importances.head(5)


Out[11]:
importance name
0 0.851840 Labor_Hours
1 0.048892 Average_Employees
2 0.013431 Coal_Supply_Region_Powder River Basin
3 0.007176 Mine_Type_Surface
4 0.004422 Mine_Status_Active

Conclusion

amazing conclusion


In [ ]: